In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
import plotly.express as px
import matplotlib.pyplot as pp
import seaborn
import matplotlib
import plotly
import plotly.offline as py
import seaborn as sns
import matplotlib.pyplot as plt 
import matplotlib.colors as mcolors
import random
import math
import time
import datetime
import operator
import warnings

from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
from plotly.graph_objs import Scatter, Layout, Figure, Data, Stream, YAxis, Marker, Bar
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
In [2]:
confirm=pd.read_csv("CovidDataConfirmGlobal.csv")
confirm=confirm.rename(columns={"Country/Region":"country","Province/State":"province"})
data_confirm_global_use1 = pd.read_csv("CovidDataConfirmGlobal.csv", usecols = [0,1,2,3,323])
data_confirm_global_use2= pd.read_csv("CovidDataConfirmGlobal.csv", usecols = [0,1,2,3,322,323])
data_confirm_global_use3 = pd.read_csv('CovidDataConfirmGlobal.csv',usecols = [1,322,323])
df_confirm = pd.read_csv('CovidDataConfirmGlobal.csv')
df_death = pd.read_csv('CovidDataDeathGlobal.csv')
df_confirm = pd.read_csv('CovidDataConfirmGlobal.csv')
df_infection = pd.read_csv('time_series_covid_19_deaths_US.csv')
df_confirm = pd.read_csv('CovidDataConfirmGlobal.csv')
df_recover = pd.read_csv('CovidDataRecoveredGlobal.csv')
confirmed_df = pd.read_csv('time_series_covid_19_confirmed.csv')
deaths_df = pd.read_csv('time_series_covid_19_deaths.csv')
recoveries_df = pd.read_csv('time_series_covid_19_recovered.csv')
In [3]:
cols = [i for i in confirm.columns if i not in ['province', 'country','Lat','Long']]
confirm_date = confirm[cols]
confirm_sum=confirm_date.apply(lambda x:sum(x))
x_data=np.array(list(confirm_date.columns))
y_data=confirm_sum

Changes in number of confirmed cases over time

  • This Figure shows the total confirmed cases from Jan 22 2020 to Dec 6 2020
In [4]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=x_data,y=y_data, mode='lines+markers',
                            name="Confirm",
                            line= dict(color="blue", width=2),
                            text = "Total  confirm:"+ str(y_data[-1])
                            ))
fig.show()
In [5]:
largest_ten_confirm=confirm.nlargest(10, ['12/6/20'])

Top 10 Countries with most confirm cases

In [6]:
fig=px.scatter(largest_ten_confirm,x="country",y="12/6/20",size="12/6/20",color="country",hover_name="country",size_max=100)
fig.show()

Change in cases over time at country level

In [7]:
def plot_for_country(country):
    labels=["Confirm"]
    colors=["red"]
    mode_size=[6,8]
    line_size=[4,5]
    
    df_list=[confirm]
    fig=go.Figure()
    for i, df in enumerate(df_list):
        x_data=np.array(list(df.iloc[:,4:].columns))
        y_data=np.sum(np.asarray(df[df['country']==country].iloc[:,4:]),axis=0)
    fig.add_trace(go.Scatter(x=x_data,y=y_data, mode='lines+markers',
                            name=labels[i],
                            line= dict(color=colors[i], width=line_size[i]),
                            connectgaps=True,
                                      text = country +"Total"+ str(labels[i])+ ":"+ str(y_data[-1])
                            ))
    fig.show()


interact(plot_for_country,country="World")
Out[7]:
<function __main__.plot_for_country(country)>

Latest Total Confrim Number throughout the world until 12/06/20

In [8]:
cols = list(data_confirm_global_use1)
cols.insert(0,cols.pop(cols.index('Country/Region')))
data_confirm_global_use1 = data_confirm_global_use1.loc[:,cols]

col_name = ['Country', 'Province','Lat','Long','cases']

data_confirm_global_use1.columns =col_name

data_confirm_global_use1['Province'] = data_confirm_global_use1['Province'].fillna('Null')
In [9]:
fig = px.scatter_mapbox(data_confirm_global_use1,
                        lat="Lat", lon = "Long",
                        zoom=1,
                        hover_data=["Country","Province", "cases"],   
                        mapbox_style="carto-positron",
                        range_color= [0, 100],title='Latest Number of Total Confirmed Cases throughout the World')
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0},
                 mapbox = {
                            
                           },)
fig.show()
In [10]:
cols2 = list(data_confirm_global_use2)
cols2.insert(0,cols2.pop(cols2.index('Country/Region')))
data_confirm_global_use2 = data_confirm_global_use2.loc[:,cols2]

col_name2 = ['Country', 'Province','Lat','Long','cases1','cases2']

data_confirm_global_use2.columns =col_name2

data_confirm_global_use2["cases1"] = data_confirm_global_use2["cases2"] - data_confirm_global_use2["cases1"]

data_confirm_global_use2 = data_confirm_global_use2.drop(['cases2'], axis=1)

data_confirm_global_use2 = data_confirm_global_use2.rename(columns={'cases1':'cases'})

data_confirm_global_use2['Province'] = data_confirm_global_use2['Province'].fillna('Null')

Latest Confrimed Number throughout the world as of 12/06/20

In [11]:
fig2 = px.scatter_mapbox(data_confirm_global_use2,
                        lat="Lat", lon = "Long",
                        zoom=1,
                        hover_data=["Country","Province", "cases"],   
                        mapbox_style="carto-positron",
                        range_color= [0, 100],title='Latest Number of Confirmed Cases throughout the World')
fig2.update_layout(margin={"r":0,"t":30,"l":0,"b":0},
                 mapbox = {
                            
                           },)
fig2.show()
In [12]:
col_name = ['Country','12/5/20','12/6/20']
data_confirm_global_use3.columns =col_name
data_confirm_global_compare=data_confirm_global_use3.groupby(['Country'], sort=False,as_index=False).sum()


data_confirm_global_compare["cases"] = data_confirm_global_compare["12/6/20"] - data_confirm_global_compare["12/5/20"]
data_confirm_global_compare = data_confirm_global_compare.drop(['12/6/20','12/5/20'], axis=1)

data_confirm_global_compare = data_confirm_global_compare.sort_values(by='cases',  ascending=True)
data_confirm_global_compare = data_confirm_global_compare.tail(50)

Comparison of new cases on Dec 6 2020

In [13]:
data  = go.Bar(
                x = data_confirm_global_compare.cases,
                y = data_confirm_global_compare.Country,
                orientation='h')
             
layout = go.Layout(
        height = 1000,
        title = "Comparsion of new cases on 2020/ 12/ 6"
)
fig  = go.Figure(data=data, layout=layout)
py.iplot(fig)

Top 10 Deaths Pie Chart as of 12/2/2020

In [14]:
df_1 = df_death.drop(['Province/State', 'Lat','Long'], axis=1)
df_sort=df_1.groupby(['Country/Region'],as_index=False, sort=False).sum()
df_sort1 = df_sort[['Country/Region', '12/2/20']]
df_a=df_sort1.sort_values(by='12/2/20', ascending=False).head(10)
labels = df_a['Country/Region']
size = df_a['12/2/20']
In [15]:
plt.pie(size,labels=labels,autopct='%.2f')
plt.title("Top 10 Deaths Pie Chart as of 12/2/2020")
plt.show()
In [16]:
df_sort=df_1.groupby(['Country/Region'],as_index=False, sort=False).sum()
df_sort1 = df_sort[['Country/Region', '12/2/20']]
df_a=df_sort1.sort_values(by='12/2/20', ascending=False).head(30)
labels = df_a['Country/Region']
size = df_a['12/2/20']

Top 30 Deaths Bar Chart as of 12/2/2020

In [17]:
f, ax = plt.subplots(figsize=(10, 20))
sns.barplot(x="12/2/20", y="Country/Region", data=df_a, orient="h")
ax.set(xlabel='Top 30 Deaths as of 12/2/20', ylabel='Country')
plt.show()

Top 10 Confirm Pie Chart as of 12/2/2020

In [18]:
df_d = df_confirm.drop(['Province/State','Lat','Long'], axis=1)
df_sort=df_confirm.groupby(['Country/Region'],as_index=False, sort=False).sum()
df_sort1 = df_sort[['Country/Region', '12/2/20']]
df_a=df_sort1.sort_values(by='12/2/20', ascending=False).head(10)
labels = df_a['Country/Region']
size = df_a['12/2/20']
In [19]:
plt.pie(size,labels=labels,autopct='%.2f')
plt.title("Top 10 Confirm Pie Chart as of 12/2/2020")
plt.show()
In [20]:
df_sort=df_confirm.groupby(['Country/Region'],as_index=False, sort=False).sum()
df_sort1 = df_sort[['Country/Region', '12/2/20']]
df_a=df_sort1.sort_values(by='12/2/20', ascending=False).head(30)
labels = df_a['Country/Region']
size = df_a['12/2/20']

Top 30 Confirm Bar Chart as of 12/2/2020

In [21]:
f, ax = plt.subplots(figsize=(10, 20))
sns.barplot(x="12/2/20", y="Country/Region", data=df_a, orient="h")
ax.set(xlabel='Top 30 Confirmed Cases as of 12/2/20', ylabel='Country')
plt.show()

Percentage of Death Rate over time

In [22]:
df_death_1 = df_death.drop(['Province/State', 'Lat','Long'], axis=1)
df_confirm_1 = df_confirm.drop(['Province/State', 'Lat','Long'], axis=1)
df_2=df_death_1.groupby(['Country/Region'], sort=False).sum()
df_3=df_confirm_1.groupby(['Country/Region'], sort=False).sum()
death_rate = df_2 / df_3
Death_rate = death_rate.fillna(0)
death = Death_rate.loc[["Australia","China","Canada","Malaysia","US","Japan"],:]
death1 = death.iloc[:,300:320]
death1 = death1.transpose()
death1 = death1 * 100
In [23]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
sns.lineplot(data=death1)
ax.set(xlabel='Date', ylabel='Percentage of Death Rate')
plt.show()

Percentage of Infection Rate for US State level over time

In [24]:
df_infection_1 = df_infection.drop(['UID', 'iso2','iso3','code3','FIPS','Admin2','Country_Region','Lat','Long_','Combined_Key'], axis=1)
df_2=df_infection_1.groupby(['Province_State'], sort=False).sum()
df_2.head(40)
df_2.drop(df_2.loc[df_2['Population']==0].index, inplace=True)
df_2.iloc[0:,1:]
Infection_Death_Rate_of_Population = df_2.iloc[0:,1:].div(df_2.Population, axis=0)
Infection_Death_Rate_of_Population = Infection_Death_Rate_of_Population.fillna(0)
Infection = Infection_Death_Rate_of_Population.loc[["Texas","Wisconsin","California","Virginia","Washington","New York"],:]
Infection1 = Infection.iloc[:,300:320]
Infection1 = Infection1.transpose()
Infection1 = Infection1 * 100
In [25]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
sns.lineplot(data=Infection1)
ax.set(xlabel='Date', ylabel='Percentage of Infection Rate')
plt.show()

Percentage of Recover Rate for certain countries over time

In [26]:
df_confirm_1 = df_confirm.drop(['Province/State', 'Lat','Long'], axis=1)
df_recover_1 = df_recover.drop(['Province/State', 'Lat','Long','12/7/20','12/8/20','12/9/20','12/10/20','12/11/20'], axis=1)
df_3=df_confirm_1.groupby(['Country/Region'], sort=False).sum()
df_4=df_recover_1.groupby(['Country/Region'], sort=False).sum()
recover_rate = df_4 / df_3
recover_rate = recover_rate.fillna(0)
recover = recover_rate.loc[["Australia","China","Canada","Malaysia","US","Japan"],:]
recover1 = recover.iloc[:,300:320]
recover1 = recover1.transpose()
In [27]:
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
sns.lineplot(data=recover1)
ax.set(xlabel='Date', ylabel='Recovery Rate')
plt.show()
In [28]:
plt.style.use('seaborn')
%matplotlib inline 
In [29]:
columns = confirmed_df.keys()
In [30]:
confirmed = confirmed_df.loc[:, columns[4]:columns[-1]]
deaths = deaths_df.loc[:, columns[4]:columns[-1]]
recoveries = recoveries_df.loc[:, columns[4]:columns[-1]]
In [31]:
dates = confirmed.keys()
world_cases = []
total_deaths = [] 
mortality_rate = []
total_recovered = [] 

for i in dates:
    confirmed_sum = confirmed[i].sum()
    death_sum = deaths[i].sum()
    recovered_sum = recoveries[i].sum()
    world_cases.append(confirmed_sum)
    total_deaths.append(death_sum)
    mortality_rate.append(death_sum/confirmed_sum)
    total_recovered.append(recovered_sum)
In [32]:
days_since_1_22 = np.array([i for i in range(len(dates))]).reshape(-1, 1)
world_cases = np.array(world_cases).reshape(-1, 1)
total_deaths = np.array(total_deaths).reshape(-1, 1)
total_recovered = np.array(total_recovered).reshape(-1, 1)
In [33]:
days_in_future = 15
future_forcast = np.array([i for i in range(len(dates)+days_in_future)]).reshape(-1, 1)
adjusted_dates = future_forcast[:-15]
In [34]:
start = '1/22/2020'
start_date = datetime.datetime.strptime(start, '%m/%d/%Y')
future_forcast_dates = []
for i in range(len(future_forcast)):
    future_forcast_dates.append((start_date + datetime.timedelta(days=i)).strftime('%m/%d/%Y'))
In [35]:
X_train_confirmed, X_test_confirmed, y_train_confirmed, y_test_confirmed = train_test_split(days_since_1_22, world_cases, test_size=0.15, shuffle=False) 
In [36]:
linear_model = LinearRegression(normalize=True, fit_intercept=True)
linear_model.fit(X_train_confirmed, y_train_confirmed)
test_linear_pred = linear_model.predict(X_test_confirmed)
linear_pred = linear_model.predict(future_forcast)
In [37]:
tol = [1e-4, 1e-3, 1e-2]
alpha_1 = [1e-7, 1e-6, 1e-5, 1e-4]
alpha_2 = [1e-7, 1e-6, 1e-5, 1e-4]
lambda_1 = [1e-7, 1e-6, 1e-5, 1e-4]
lambda_2 = [1e-7, 1e-6, 1e-5, 1e-4]

bayesian_grid = {'tol': tol, 'alpha_1': alpha_1, 'alpha_2' : alpha_2, 'lambda_1': lambda_1, 'lambda_2' : lambda_2}

bayesian = BayesianRidge()
bayesian_search = RandomizedSearchCV(bayesian, bayesian_grid, scoring='neg_mean_squared_error', cv=3, return_train_score=True, n_jobs=-1, n_iter=40, verbose=1)
bayesian_search.fit(X_train_confirmed, y_train_confirmed)
warnings.filterwarnings('ignore')
Fitting 3 folds for each of 40 candidates, totalling 120 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    1.5s finished
C:\Users\Jimmy\anaconda3\lib\site-packages\sklearn\utils\validation.py:73: DataConversionWarning:

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().

In [38]:
bayesian_confirmed = bayesian_search.best_estimator_
test_bayesian_pred = bayesian_confirmed.predict(X_test_confirmed)
bayesian_pred = bayesian_confirmed.predict(future_forcast)
In [39]:
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
plt.title('Total Confirm of Coronavirus Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Total Confirm of Cases', size=30)
plt.xticks(size=15)
plt.show()
In [40]:
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
plt.plot(future_forcast, linear_pred, linestyle='dashed', color='orange')
plt.title('Linear Regression Prediction of Coronavirus Cases Over Time', size=30)
plt.xlabel('Days Since 1/22/2020', size=30)
plt.ylabel('Linear Regression Prediction of Cases', size=30)
plt.legend(['Confirmed Cases', 'Linear Regression Predictions'])
plt.xticks(size=15)
plt.show()
In [41]:
plt.figure(figsize=(20, 12))
plt.plot(adjusted_dates, world_cases)
plt.plot(future_forcast, bayesian_pred, linestyle='dashed', color='green')
plt.title('Bayesian Ridge Regression Prediction of Coronavirus Cases Over Time', size=30)
plt.xlabel('Time', size=30)
plt.ylabel('Bayesian Ridge Regression Prediction of Cases', size=30)
plt.legend(['Confirmed Cases', 'Bayesian Ridge Regression Predictions'])
plt.xticks(size=15)
plt.show()
In [42]:
plt.figure(figsize=(10, 7))
plt.plot(adjusted_dates, total_deaths, color='r')
plt.plot(adjusted_dates, total_recovered, color='green')
plt.legend(['Deaths', 'Recoveries'], loc='best', fontsize=20)
plt.title('Death and Recoveries of Coronavirus Cases', size=20)
plt.xlabel('Time', size=20)
plt.ylabel('Deaths and Recovery of Cases', size=20)
plt.xticks(size=15)
plt.show()
In [ ]: